library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.0 ✔ purrr 0.2.5
## ✔ tibble 2.0.1 ✔ dplyr 0.7.8
## ✔ tidyr 0.8.3 ✔ stringr 1.3.1
## ✔ readr 1.3.1 ✔ forcats 0.3.0
## ── Conflicts ───────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
# Use a larger default figure size for the knitted Rmd/HTML output
knitr::opts_chunk$set(fig.width = 15, fig.height = 10)
# Load the minimum wage data from CSV.
# stringsAsFactors is spelled out because its default changed to FALSE in
# R 4.0.0; the str() output further down shows State as a factor.
data <- read.csv("MinimumWageData.csv", stringsAsFactors = TRUE)
names(data)
## [1] "Year" "State" "Table_Data" "Footnote" "High.Value"
## [6] "Low.Value" "CPI.Average" "High.2018" "Low.2018"
head(data, n = 2)
## Year State Table_Data Footnote High.Value Low.Value CPI.Average
## 1 1968 Alabama ... 0.0 0.0 34.78333
## 2 1968 Alaska 2.1 2.1 2.1 34.78333
## High.2018 Low.2018
## 1 0.00 0.00
## 2 15.12 15.12
# Table_Data and Footnote only hold notes, so keep every column except those two
drops <- c("Table_Data", "Footnote")
keep_cols <- setdiff(names(data), drops)
data <- data[, keep_cols]
head(data, n = 3)
## Year State High.Value Low.Value CPI.Average High.2018 Low.2018
## 1 1968 Alabama 0.00 0.000 34.78333 0.00 0.00
## 2 1968 Alaska 2.10 2.100 34.78333 15.12 15.12
## 3 1968 Arizona 0.66 0.468 34.78333 4.75 3.37
# Inspect the structure of the data frame to confirm each column has a sensible type
str(object = data)
## 'data.frame': 2750 obs. of 7 variables:
## $ Year : int 1968 1968 1968 1968 1968 1968 1968 1968 1968 1968 ...
## $ State : Factor w/ 55 levels "Alabama","Alaska",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ High.Value : num 0 2.1 0.66 0.156 1.65 ...
## $ Low.Value : num 0 2.1 0.468 0.156 1.65 ...
## $ CPI.Average: num 34.8 34.8 34.8 34.8 34.8 ...
## $ High.2018 : num 0 15.12 4.75 1.12 11.88 ...
## $ Low.2018 : num 0 15.12 3.37 1.12 11.88 ...
The data was already cleaned on Kaggle, so the cleaning here is minimal. Apart from deleting the two columns that did not contain valuable information, the handling of NA values, data types, factor conversion, etc. had already been done.
# Split the data into two eras for graphing: before 1990, and 1990 onward
pre1990 <- data %>% filter(Year < 1990)
post1990 <- data %>% filter(Year >= 1990)
# Count of records per year in each era (one histogram bin per year)
p <- ggplot(pre1990, aes(x = Year)) +
  geom_histogram(binwidth = 1, color = "black", fill = "white") +
  labs(x = "Year", y = "Count of Data Points", title = "Data Points by Year (Pre-1990)")
p
p <- ggplot(post1990, aes(x = Year)) +
  geom_histogram(binwidth = 1, color = "black", fill = "gray") +
  labs(x = "Year", y = "Count of Data Points", title = "Data Points by Year (Post-1990)")
p
# The histograms show an even number of records per year, so comparisons
# between years should be fairly accurate
# Spread of High.Value and Low.Value for each state, one panel per state.
# Panels are declared with a formula (~ State) so the faceting variable is
# looked up in each plot's own data instead of being passed in as an external
# vector.
# NOTE(review): Year is continuous here, so each panel presumably shows a
# single box covering all years of the era -- confirm that is the intended view.
p <- ggplot(pre1990, aes(x = Year, y = High.Value, color = State)) +
  geom_boxplot(outlier.color = "red", outlier.shape = 8, outlier.size = 2) +
  labs(x = "Year", y = "Hourly Wage ($)",
       title = "Highest Minimum Wage Across US States pre-1990") +
  facet_wrap(~ State) +
  theme(legend.position = "none")
p
## Warning: Removed 11 rows containing non-finite values (stat_boxplot).
p <- ggplot(pre1990, aes(x = Year, y = Low.Value, color = State)) +
  geom_boxplot(outlier.color = "red", outlier.shape = 8, outlier.size = 2) +
  labs(x = "Year", y = "Hourly Wage ($)",
       title = "Lowest Minimum Wage Across US States pre-1990") +
  facet_wrap(~ State) +
  theme(legend.position = "none")
p
## Warning: Removed 11 rows containing non-finite values (stat_boxplot).
p <- ggplot(post1990, aes(x = Year, y = High.Value, color = State)) +
  geom_boxplot(outlier.color = "red", outlier.shape = 8, outlier.size = 2) +
  labs(x = "Year", y = "Hourly Wage ($)",
       title = "Highest Minimum Wage Across US States post-1990") +
  facet_wrap(~ State) +
  theme(legend.position = "none")
p
p <- ggplot(post1990, aes(x = Year, y = Low.Value, color = State)) +
  geom_boxplot(outlier.color = "red", outlier.shape = 8, outlier.size = 2) +
  labs(x = "Year", y = "Hourly Wage ($)",
       title = "Lowest Minimum Wage Across US States post-1990") +
  facet_wrap(~ State) +
  theme(legend.position = "none")
p
# Overall trend of High.Value and Low.Value across the years: a violin of the
# wage distribution with a line drawn through the per-year medians.
# `shape` is a point aesthetic that geom = "line" ignores with a warning, so it
# is no longer passed to stat_summary().
# `fun.y` is kept because the ggplot2 version recorded at the top of this
# document (3.1.0) predates the newer `fun` argument.
p <- ggplot(pre1990, aes(x = Year, y = High.Value)) +
  geom_violin(trim = FALSE) +
  stat_summary(fun.y = median, geom = "line", size = 2, color = "red") +
  labs(x = "Year", y = "High Minimum Wage ($)", title = "High Minimum Wage Trend pre-1990")
p
## Warning: Removed 11 rows containing non-finite values (stat_ydensity).
## Warning: Removed 11 rows containing non-finite values (stat_summary).
p <- ggplot(pre1990, aes(x = Year, y = Low.Value)) +
  geom_violin(trim = FALSE) +
  stat_summary(fun.y = median, geom = "line", size = 2, color = "blue") +
  labs(x = "Year", y = "Low Minimum Wage ($)", title = "Low Minimum Wage Trend pre-1990")
p
## Warning: Removed 11 rows containing non-finite values (stat_ydensity).
## Warning: Removed 11 rows containing non-finite values (stat_summary).
p <- ggplot(post1990, aes(x = Year, y = High.Value)) +
  geom_violin(trim = FALSE) +
  stat_summary(fun.y = median, geom = "line", size = 2, color = "green") +
  labs(x = "Year", y = "High Minimum Wage ($)", title = "High Minimum Wage Trend post-1990")
p
p <- ggplot(post1990, aes(x = Year, y = Low.Value)) +
  geom_violin(trim = FALSE) +
  stat_summary(fun.y = median, geom = "line", size = 2, color = "purple") +
  labs(x = "Year", y = "Low Minimum Wage ($)", title = "Low Minimum Wage Trend post-1990")
p
# TODO: look at the High.2018 and Low.2018 data in a similar way (left off here
# because it was almost midnight)
All states over time – ggplotly
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# One line per state of the highest minimum wage over time, rendered as an
# interactive plotly widget.
# `group` replaces `by`, which is not a ggplot2 aesthetic and was silently
# ignored; the lines were only separated by the colour mapping.
p2 <- ggplot(data, aes(x = Year, y = High.Value, group = State)) +
  geom_line(aes(col = State))
ggplotly(p2)
How do states compare with the Federal wage? Which states closely follow it? Which states are above? Below?
library(dplyr)
# Federal minimum wage per year; the column is renamed while selecting it
fed <- data %>%
  filter(State == "Federal (FLSA)") %>%
  select(Year, fed.High.Value = High.Value)
# An "era" is the set of years sharing one federal wage value; xmin and xmax
# are the first and last year of that era. Ungrouped so later verbs do not
# inherit the grouping.
eras <- fed %>%
  dplyr::group_by(fed.High.Value) %>%
  mutate(xmin = min(Year), xmax = max(Year)) %>%
  ungroup()
# Difference between each state's wage and the federal wage of the same year.
# The federal value is matched on Year rather than by row position, so the
# result does not depend on every state having the same years in the same order.
data2 <- data %>%
  left_join(fed, by = "Year") %>%
  mutate(fed_diff = High.Value - fed.High.Value) %>%
  select(-fed.High.Value)
newdata <- merge(x = data2, y = eras, by = "Year")
# One row per era, so each guide line and label is drawn once. Drawing them
# once per row of newdata stacks many copies and cancels the alpha setting.
era_marks <- distinct(eras, xmin, fed.High.Value)
eralines <- geom_vline(xintercept = era_marks$xmin, alpha = 0.25)
eralabels <- geom_text(data = era_marks, aes(x = xmin, label = fed.High.Value),
                       y = 4, angle = 90, size = 3, hjust = 0.2,
                       inherit.aes = FALSE)
zeroline <- geom_hline(yintercept = 0)
plotlabs <- labs(title = "State Minimum Wage -- difference from Federal minimum wage",
                 y = "Difference ($)")
# `group` replaces `by`, which is not a ggplot2 aesthetic
p3 <- ggplot(newdata, aes(x = Year, y = fed_diff, group = State)) +
  geom_line(aes(col = State))
ggplotly(p3 + eralines + zeroline + eralabels + plotlabs)
p3 + eralines + eralabels + zeroline + plotlabs + theme(legend.position = "none")
## Warning: Removed 11 rows containing missing values (geom_path).
How much is the minimum wage worth in 2018 dollars?
# One row per federal-wage era, so each guide line and label is drawn once.
# Drawing them once per row of newdata stacks many copies and cancels the
# alpha setting.
era_marks <- unique(newdata[, c("xmin", "fed.High.Value")])
eralines <- geom_vline(xintercept = era_marks$xmin, alpha = 0.25)
eralabels <- geom_text(data = era_marks, aes(x = xmin, label = fed.High.Value),
                       y = 15, angle = 90, size = 3, hjust = 0.3,
                       inherit.aes = FALSE)
# Federal series drawn on top as a thicker black line
fedline <- geom_line(data = newdata %>% filter(State == "Federal (FLSA)"),
                     col = "black", size = 1.2)
plotlabs <- labs(title = "State Minimum Wages in 2018 Dollars",
                 y = "($)")
# `group` replaces `by`, which is not a ggplot2 aesthetic
p4 <- ggplot(newdata, aes(x = Year, y = High.2018, group = State)) +
  geom_line(aes(col = State))
ggplotly(p4 + eralines + fedline + eralabels + plotlabs)
p4 + eralines + eralabels + plotlabs + fedline + theme(legend.position = "none")
## Warning: Removed 11 rows containing missing values (geom_path).
# Ratio of the CPI average to the minimum wage in 2018 dollars, one line per state.
# `group` replaces `by`, which is not a ggplot2 aesthetic.
p5 <- ggplot(newdata, aes(x = Year, y = CPI.Average / High.2018, group = State)) +
  geom_line(aes(col = State))
# One row per federal-wage era, so each guide line and label is drawn once.
# Drawing them once per row of newdata stacks many copies and cancels the
# alpha setting.
era_marks <- unique(newdata[, c("xmin", "fed.High.Value")])
eralines <- geom_vline(xintercept = era_marks$xmin, alpha = 0.25)
eralabels <- geom_text(data = era_marks, aes(x = xmin, label = fed.High.Value),
                       y = 15, angle = 90, size = 3, hjust = 0.3,
                       inherit.aes = FALSE)
fedline <- geom_line(data = newdata %>% filter(State == "Federal (FLSA)"),
                     col = "black", size = 1.2)
plotlabs <- labs(title = "Hours of min. wage work needed to meet CPI",
                 y = "Hours")
ggplotly(p5 + eralines + fedline + eralabels + plotlabs)
# NOTE(review): the static version omits fedline, unlike the interactive one
# above -- confirm whether that is deliberate.
p5 + eralines + eralabels + plotlabs + theme(legend.position = "none")
## Warning: Removed 11 rows containing missing values (geom_path).
Show a graphical summary of how minimum wage growth (or decline) has kept up with inflation for Federal, Iowa, California, DC, etc.
# Federal and Iowa wages after 1990: solid lines are 2018 dollars (High.2018),
# dashed lines are the nominal wage (High.Value).
# The subset is built once and shared by every layer instead of being
# re-filtered per layer.
p6_data <- newdata %>%
  filter(State %in% c("Federal (FLSA)", "Iowa")) %>%
  filter(Year > 1990)
# `group` replaces `by`, which is not a ggplot2 aesthetic
p6.2018 <- ggplot(p6_data, aes(x = Year, y = High.2018, group = State)) +
  geom_line(aes(col = State)) +
  geom_point(aes(col = State))
# Not added to the plots below; unique() keeps one guide line per era start year
eralines <- geom_vline(xintercept = unique(newdata$xmin), alpha = 0.25)
p6.real <- geom_line(data = p6_data, aes(x = Year, y = High.Value, col = State),
                     linetype = "dashed")
p6.real.points <- geom_point(data = p6_data, aes(x = Year, y = High.Value, col = State))
plotlabs <- labs(title = "A Look at Inflation",
                 y = "Wage")
# NOTE(review): the interactive version omits p6.real.points, unlike the static
# one below -- confirm whether that is deliberate.
ggplotly(p6.2018 + p6.real + plotlabs)
p6.2018 + p6.real + p6.real.points + plotlabs